import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from umap.umap_ import UMAP
import plotly.express as px
# PCA
#from sklearn.decomposition import PCA
# Library for t-SNE projections
#from sklearn.manifold import TSNE
2022-09-26 22:27:27.024697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Load metadata
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta
# Load gene expression matrix
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0]
tdf = tdf[1:]
#tdf
# Merge metadata and gene expression matrix
main_df = pd.concat([meta, tdf], axis=1, join="inner")
#main_df
# Store Sample IDs as a list
sample_ids = main_df.index.to_list()
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
#short_histology_df
# Store histologies
hist = short_histology_df['Short_Histology'].to_list()
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
#umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#proj_3d = umap_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
#umap_2d_df
ufig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', title='UMAP Plot of Short Histology', color=hist, hover_data=['Sample_ID'])
#ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
# ufig_2d.update_xaxes(title_text='UMAP1')
# ufig_2d.update_yaxes(title_text='UMAP2')
ufig_2d.show()
#ufig_3d.show()
# Plot t-SNE
#tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
#tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')
#proj_2d = tsne_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#proj_3d = tsne_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Short Histology', color=hist)
#tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
#tfig_2d.show()
#tfig_3d.show()
# Perform PCA
#pca2 = PCA(n_components=2)
#pca3 = PCA(n_components=3)
#df_pca2 = pca2.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
#df_pca3 = pca3.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
# Display PCA plot
#pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Short Histology', color=hist)
# Display 3D PCA plot
#pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=hist)
#pca_2d.show()
#pca_3d.show()
# Split the dataset into features and labels
sh_X = short_histology_df.loc[:, short_histology_df.columns != 'Short_Histology'].values
sh_y = short_histology_df.loc[:, short_histology_df.columns == 'Short_Histology'].values.ravel()
# Split data into training and testing set
sh_X_train, sh_X_test, sh_y_train, sh_y_test = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)
#Sanity check
print(sh_X_train.shape, sh_X_test.shape, sh_y_train.shape, sh_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
# Histology Class Histogram
fig = px.histogram(short_histology_df, x='Short_Histology', title='Histology Classes')
fig.update_xaxes(title_text='Histology')
fig.update_yaxes(title_text='Number of Samples')
fig.show()
# Initialize random forest classifier
sh_rf = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
sh_rf.fit(sh_X_train, sh_y_train)
# Make predictions using random forest classifier
sh_rf_y_pred = sh_rf.predict(sh_X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(sh_y_test, sh_rf_y_pred)}')
Accuracy: 0.6
# Calculate a confusion matrix
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(sh_cm, text_auto=True,
labels=dict(x="Predicted Subtype", y="True Subtype", color="Productivity"),
x=short_histology_df['Short_Histology'].unique().tolist(),
y=short_histology_df['Short_Histology'].unique().tolist(),
title='Histology Classification Accuracy'
)
disp.show()
# What are the most important features?
sh_rf_feature_list = short_histology_df.columns
sh_rf_feature_list = sh_rf_feature_list.drop('Short_Histology')
sh_rf_imp_features = pd.Series(sh_rf.feature_importances_, index=sh_rf_feature_list)
sh_rf_imp_genes = sh_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
sh_rf_imp_genes.columns = ["features", "importance"]
sh_rf_imp_genes_fil = sh_rf_imp_genes[~(sh_rf_imp_genes == 0.000000).any(axis=1)]
sh_rf_imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-4685-3p | 0.030000 |
| 1 | miR-374b-3p | 0.030000 |
| 2 | miR-4727-5p | 0.020000 |
| 3 | miR-6722-5p | 0.020000 |
| 4 | miR-6798-5p | 0.020000 |
| ... | ... | ... |
| 94 | miR-223-5p | 0.002857 |
| 95 | miR-17-3p | 0.002857 |
| 96 | miR-181b-5p | 0.002857 |
| 97 | miR-6872-3p | 0.002857 |
| 98 | miR-6078 | 0.002857 |
99 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(sh_rf_imp_genes_fil, x=sh_rf_imp_genes_fil.features, y=sh_rf_imp_genes_fil.importance)
fig.show()
relapse_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df
# Store relapse data
relapse = relapse_df['Relapse'].to_list()
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
#umap_2d_df
ufig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2', title='UMAP Plot of Relapse', color=relapse, hover_data=['Sample_ID'])
ufig_2d.show()
# Split the dataset into features and labels
r_X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
r_y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()
# Split data into training and testing set
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(r_X, r_y, test_size=0.25, random_state=42)
#Sanity check
print(r_X_train.shape, r_X_test.shape, r_y_train.shape, r_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
# Relapse Classes Histogram
fig = px.histogram(relapse_df, x='Relapse', title='Relapse Classes')
fig.update_xaxes(title_text='Relapse?')
fig.update_yaxes(title_text='Number of Samples')
fig.show()
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
r_rf.fit(r_X_train, r_y_train)
# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(r_X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(r_y_test, r_rf_y_pred)}')
Accuracy: 0.8
# Calculate a confusion matrix
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
labels=dict(x="Predicted Relapse", y="True Relapse", color="Productivity"),
x=relapse_df['Relapse'].unique().tolist(),
y=relapse_df['Relapse'].unique().tolist()
)
disp.show()
print(r_X_test)
print('True Relapse Label:')
print(r_y_test)
print('Predicted Relapse:')
print(r_rf_y_pred)
[[16 16 22 ... 891 75 290] [35 60 49 ... 1876 78 662] [12 68 43 ... 1020 174 370] [45 14 36 ... 8387 128 1737] [39 30 21 ... 1508 64 254]] True Relapse Label: ['Yes' 'Yes' 'No' 'Yes' 'No'] Predicted Relapse: ['Yes' 'Yes' 'No' 'Yes' 'Yes']
The ML algorithm misclassified patient 15635-101 as having a relapse, but this patient did not at the time of plasma collection. This patient should be investigated further for relapse.
from sklearn import tree
fn = relapse_df.loc[:, relapse_df.columns != 'Relapse'].columns.to_list()
cn = r_rf.classes_.tolist()
tree.plot_tree(r_rf.estimators_[0], feature_names = fn, class_names=cn, filled = True)
[Text(0.5, 0.75, 'miR-873-5p <= 1924.0\ngini = 0.278\nsamples = 6\nvalue = [2, 10]\nclass = Yes'), Text(0.25, 0.25, 'gini = 0.0\nsamples = 5\nvalue = [0, 10]\nclass = Yes'), Text(0.75, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [2, 0]\nclass = No')]
This is Decision Tree #1 out of 100 total decision trees within the Random Forest Classifier.
# What are the most important features?
r_rf_feature_list = relapse_df.columns
r_rf_feature_list = r_rf_feature_list.drop('Relapse')
r_rf_imp_features = pd.Series(r_rf.feature_importances_, index=r_rf_feature_list)
r_rf_imp_genes = r_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
r_rf_imp_genes.columns = ["features", "importance"]
r_rf_imp_genes_fil = r_rf_imp_genes[~(r_rf_imp_genes == 0.000000).any(axis=1)]
r_rf_imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-6506-3p | 0.030303 |
| 1 | miR-502-5p | 0.020202 |
| 2 | miR-4677-5p | 0.020202 |
| 3 | miR-19a-5p | 0.020202 |
| 4 | miR-887-5p | 0.020202 |
| ... | ... | ... |
| 91 | miR-7106-3p | 0.006734 |
| 92 | miR-1976 | 0.006734 |
| 93 | miR-7157-3p | 0.003367 |
| 94 | miR-4636 | 0.003367 |
| 95 | miR-505-3p | 0.002886 |
96 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(r_rf_imp_genes_fil, x=r_rf_imp_genes_fil.features, y=r_rf_imp_genes_fil.importance)
fig.show()
# Create a dataframe containing only ML selected genes
ml_hist_df = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list()[0:1])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_hist_df)
proj_3d = umap_3d.fit_transform(ml_hist_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using genes found by Random Forest Algorithm (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using genes found by Random Forest Algorithm (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
print('Name of the single miRNA: ', sh_rf_imp_genes_fil['features'].to_list()[0:1][0])
Name of the single miRNA: miR-4685-3p
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot for using all genes found by Random Forest Algorithm (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using all genes found by Random Forest Algorithm (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list()[0:1]) # Also [0:10] (ten genes)
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using most important gene found by Random Forest Algorithm (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using most important gene found by Random Forest Algorithm (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# The most important gene determined by ML algorithm
print('Name of the single miRNA: ', r_rf_imp_genes_fil['features'].to_list()[0:1][0])
Name of the single miRNA: miR-6506-3p
Interestingly, patient 15635-101 is being clustered with the relapsed patients based on just miRNA signature.
# Add "hsa-" to all miRNA genes
hist_genes = sh_rf_imp_genes_fil['features'].to_list()
relapse_genes = r_rf_imp_genes_fil['features'].to_list()
hist_genes = ["hsa-" + mirnas for mirnas in hist_genes]
relapse_genes = ["hsa-" + mirnas for mirnas in relapse_genes]
# Export miRNA genes important for distinguishing LGG and HGG brain tumors
with open("LGG_miRNAs.txt", 'w') as file:
for row in hist_genes:
s = "".join(map(str, row))
file.write(s+'\n')
# Export miRNA genes important for relapse in LGG/HGG brain tumors
with open("LGG_relapse_miRNAs.txt", 'w') as file:
for row in relapse_genes:
s = "".join(map(str, row))
file.write(s+'\n')
dip_test_genes = ['miR-339-5p', 'miR-1299', 'let-7g-5p', 'miR-145-3p', 'miR-6752-5p', 'miR-708-5p',
'miR-3679-3p','miR-448', 'miR-548ag', 'miR-4796-5p', 'miR-6844', 'miR-5697', 'miR-4765',
'miR-8076', 'miR-1278', 'miR-3606-5p', 'miR-4424', 'miR-2114-3p', 'miR-4677-5p', 'miR-644a',
'miR-6739-3p']
print('Number of miRNA genes found for histology: ', len(sh_rf_imp_genes_fil['features'].to_list()), 'genes')
print('Number of miRNA genes found for relapse: ', len(r_rf_imp_genes_fil['features'].to_list()), 'genes')
print('Number of miRNA genes found using dip test: ', len(dip_test_genes), 'genes')
Number of miRNA genes found for histology: 99 genes Number of miRNA genes found for relapse: 96 genes Number of miRNA genes found using dip test: 21 genes
# Create a dataframe containing only dip test genes
dip_histology_df = short_histology_df[dip_test_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(dip_histology_df)
proj_3d = umap_3d.fit_transform(dip_histology_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using Dip Test genes (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using Dip Test genes (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
histology_cols = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list())
cols = list(set(histology_cols) & set(dip_test_genes))
#print(len(dip_test_genes))
#print(len(cols))
#list(set(cols) ^ set(dip_test_genes))
print('List of Dip Test genes that are also found from ML model for Histology: ')
cols
List of Dip Test genes that are also found from ML model for Histology:
['miR-339-5p']
# Create a dataframe containing only dip test genes
dip_relapse_df = relapse_df[dip_test_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(dip_relapse_df)
proj_3d = umap_3d.fit_transform(dip_relapse_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using Dip Test genes (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using Dip Test genes (Relapse)',
color=relapse, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
relapse_cols = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())
cols = list(set(relapse_cols) & set(dip_test_genes))
#print(len(dip_test_genes))
#print(len(cols))
#list(set(cols) ^ set(dip_test_genes))
print('List of Dip Test genes that are also found from ML model (Relapse): ')
cols
List of Dip Test genes that are also found from ML model (Relapse):
['miR-3606-5p', 'miR-4677-5p', 'miR-644a']
Check if the miRNA genes found are differentially expressed between LGG and HGG patients, and between relapse and non-relapse patients.
hdf_de = pd.read_csv('lb_plasma_histology_DE.csv')
hdf_de = hdf_de.dropna() # Left with 756 miRNA genes
# Save the DE genes to a list
de_hist_genes = hdf_de[hdf_de['padj']<0.05]['Unnamed: 0'].tolist()
# DE genes
hdf_de[hdf_de['padj']<0.05] # 36 significant DE miRNA genes
| Unnamed: 0 | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
|---|---|---|---|---|---|---|---|
| 0 | miR-9-3p | 369.895025 | -3.545678 | 0.547516 | -6.475941 | 9.422266e-11 | 7.132655e-08 |
| 1 | miR-215-5p | 776.272909 | -3.879022 | 0.705960 | -5.494677 | 3.914255e-08 | 1.481545e-05 |
| 2 | miR-375 | 3138.874448 | -3.055984 | 0.660167 | -4.629108 | 3.672441e-06 | 9.266793e-04 |
| 3 | miR-194-5p | 1887.090486 | -2.689264 | 0.625226 | -4.301267 | 1.698240e-05 | 3.213920e-03 |
| 4 | miR-4665-5p | 1233.004873 | 1.470567 | 0.357956 | 4.108238 | 3.986887e-05 | 5.836417e-03 |
| 5 | miR-4667-5p | 3034.388653 | 1.513389 | 0.371497 | 4.073763 | 4.625958e-05 | 5.836417e-03 |
| 6 | miR-339-3p | 19679.511317 | 2.426121 | 0.603150 | 4.022416 | 5.760415e-05 | 6.229477e-03 |
| 7 | miR-873-5p | 584.103051 | 1.479436 | 0.371861 | 3.978464 | 6.936194e-05 | 6.563374e-03 |
| 8 | miR-4473 | 381.880103 | 2.407600 | 0.630612 | 3.817878 | 1.346045e-04 | 1.132174e-02 |
| 9 | miR-145-5p | 8835.449194 | -2.200134 | 0.597729 | -3.680820 | 2.324850e-04 | 1.599920e-02 |
| 10 | miR-192-5p | 3898.417422 | -2.339416 | 0.633765 | -3.691301 | 2.231097e-04 | 1.599920e-02 |
| 11 | miR-671-5p | 7550.662349 | 1.754070 | 0.479567 | 3.657613 | 2.545752e-04 | 1.605945e-02 |
| 12 | miR-4447 | 579.198702 | 1.294593 | 0.368484 | 3.513297 | 4.425832e-04 | 2.435942e-02 |
| 13 | miR-4685-3p | 676.896195 | 1.408975 | 0.403649 | 3.490599 | 4.819391e-04 | 2.435942e-02 |
| 14 | miR-6788-3p | 1898.281023 | 1.677635 | 0.480672 | 3.490187 | 4.826834e-04 | 2.435942e-02 |
| 15 | miR-1233-3p | 12734.713006 | 1.438187 | 0.422884 | 3.400904 | 6.716331e-04 | 2.824590e-02 |
| 16 | miR-200c-3p | 211.510912 | -1.746093 | 0.510905 | -3.417649 | 6.316462e-04 | 2.824590e-02 |
| 17 | miR-550a-5p | 268.518711 | 0.856283 | 0.251547 | 3.404062 | 6.639163e-04 | 2.824590e-02 |
| 18 | miR-4706 | 2351.532760 | 1.095132 | 0.328881 | 3.329876 | 8.688460e-04 | 3.461665e-02 |
| 19 | miR-101-5p | 212.197838 | -0.814384 | 0.248808 | -3.273143 | 1.063587e-03 | 3.918903e-02 |
| 20 | miR-4522 | 1546.492458 | 1.310731 | 0.401210 | 3.266945 | 1.087146e-03 | 3.918903e-02 |
| 21 | miR-6852-3p | 6087.991997 | 1.532035 | 0.471781 | 3.247343 | 1.164880e-03 | 4.008245e-02 |
| 22 | miR-6723-5p | 485.680863 | 1.870328 | 0.580710 | 3.220759 | 1.278517e-03 | 4.207989e-02 |
| 23 | miR-143-3p | 7497.934558 | -1.533629 | 0.479971 | -3.195252 | 1.397089e-03 | 4.406652e-02 |
| 24 | miR-4727-5p | 528.776095 | 0.904792 | 0.285032 | 3.174353 | 1.501707e-03 | 4.547168e-02 |
| 25 | miR-4519 | 811.173449 | 1.073130 | 0.342321 | 3.134866 | 1.719328e-03 | 4.820485e-02 |
| 26 | miR-8069 | 955.107411 | 0.942425 | 0.299746 | 3.144079 | 1.666105e-03 | 4.820485e-02 |
| 27 | miR-7109-5p | 310.853763 | 0.903019 | 0.289804 | 3.115967 | 1.833428e-03 | 4.956804e-02 |
| 28 | miR-1287-5p | 21170.276376 | 1.411937 | 0.463423 | 3.046758 | 2.313240e-03 | 4.959436e-02 |
| 29 | miR-200a-3p | 195.398108 | -0.998384 | 0.322769 | -3.093183 | 1.980217e-03 | 4.959436e-02 |
| 30 | miR-3157-5p | 4207.979636 | 1.227023 | 0.395784 | 3.100232 | 1.933693e-03 | 4.959436e-02 |
| 31 | miR-3943 | 4022.197686 | 1.450824 | 0.472285 | 3.071927 | 2.126820e-03 | 4.959436e-02 |
| 32 | miR-6080 | 632.156671 | 1.468667 | 0.484282 | 3.032667 | 2.424031e-03 | 4.959436e-02 |
| 33 | miR-6088 | 440.890616 | 0.736990 | 0.240070 | 3.069893 | 2.141353e-03 | 4.959436e-02 |
| 34 | miR-6869-5p | 903.322606 | 0.967520 | 0.318780 | 3.035067 | 2.404824e-03 | 4.959436e-02 |
| 35 | miR-7150 | 1599.792604 | 1.350058 | 0.443170 | 3.046368 | 2.316243e-03 | 4.959436e-02 |
| 36 | miR-96-5p | 218.531065 | -1.353603 | 0.445418 | -3.038950 | 2.374044e-03 | 4.959436e-02 |
# None of the dip test genes were differentially expressed
print('Number of dip test and DE genes that overlap (Histology):')
list(set(de_hist_genes) & set(dip_test_genes))
Number of dip test and DE genes that overlap (Histology):
[]
# 16 ML found genes for short histology were differentially expressed significantly
sh_overlap = list(set(de_hist_genes) & set(sh_rf_imp_genes_fil['features'].to_list()))
print('Number of ML and DE genes that overlap (Histology):')
sh_overlap
Number of ML and DE genes that overlap (Histology):
['miR-873-5p', 'miR-7150', 'miR-550a-5p', 'miR-3157-5p', 'miR-6080', 'miR-4685-3p', 'miR-200a-3p', 'miR-1233-3p', 'miR-3943', 'miR-6852-3p', 'miR-6088', 'miR-4667-5p', 'miR-4727-5p', 'miR-8069', 'miR-194-5p', 'miR-96-5p']
# Most important gene determined by ML for Histology
print('Most important gene found by ML (Histology):', sh_rf_imp_genes_fil['features'].to_list()[0:1][0])
Most important gene found by ML (Histology): miR-4685-3p
rdf_de = pd.read_csv('lb_plasma_relapse_DE.csv')
rdf_de = rdf_de.dropna() # Left with 1096 miRNA genes
# Store DE genes as a list
de_relapse_genes = rdf_de[rdf_de['padj']<0.05]['Unnamed: 0'].tolist()
rdf_de[rdf_de['padj']<0.05] # 159 significant DE miRNA genes
| Unnamed: 0 | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
|---|---|---|---|---|---|---|---|
| 0 | miR-148a-3p | 2627.979100 | 2.515368 | 0.492092 | 5.111581 | 3.194742e-07 | 0.000175 |
| 1 | miR-194-5p | 883.425019 | 2.034687 | 0.397943 | 5.113015 | 3.170576e-07 | 0.000175 |
| 2 | miR-19b-3p | 50281.482652 | 1.821879 | 0.368146 | 4.948793 | 7.467513e-07 | 0.000273 |
| 3 | miR-17-5p | 15476.879263 | 1.591049 | 0.334409 | 4.757790 | 1.957240e-06 | 0.000429 |
| 4 | miR-6080 | 1838.423762 | -3.425195 | 0.715219 | -4.789017 | 1.675999e-06 | 0.000429 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 154 | miR-3124-5p | 545.908489 | -1.081474 | 0.398509 | -2.713802 | 6.651588e-03 | 0.047076 |
| 155 | miR-126-3p | 30652.890467 | 1.335901 | 0.493481 | 2.707097 | 6.787448e-03 | 0.047730 |
| 156 | miR-4474-3p | 83.975902 | 0.829953 | 0.307711 | 2.697179 | 6.992957e-03 | 0.048552 |
| 157 | miR-6741-3p | 4287.744769 | -1.358001 | 0.503096 | -2.699285 | 6.948857e-03 | 0.048552 |
| 158 | miR-379-5p | 132.455091 | 1.476587 | 0.548646 | 2.691330 | 7.116772e-03 | 0.049101 |
159 rows × 7 columns
# 11 ML found genes for recurrence were differentially expressed significantly
print('Number of ML and DE genes that overlap (Relapse):')
r_overlap = list(set(de_relapse_genes) & set(r_rf_imp_genes_fil['features'].to_list()))
r_overlap
Number of ML and DE genes that overlap (Relapse):
['miR-6762-3p', 'miR-873-5p', 'miR-6499-5p', 'miR-7846-3p', 'miR-4685-3p', 'miR-4268', 'miR-16-5p', 'miR-671-5p', 'miR-4432', 'miR-194-5p', 'miR-34a-3p']
# Most important gene determined by ML for Relapse
print('Most important gene found by ML (Relapse):', r_rf_imp_genes_fil['features'].to_list()[0:1][0])
Most important gene found by ML (Relapse): miR-6506-3p
# Create a dataframe containing only DE overlap genes for histology
de_histology_df = short_histology_df[de_hist_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(de_histology_df)
proj_3d = umap_3d.fit_transform(de_histology_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using DE genes (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using DE genes (Histology)',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# Create a dataframe containing only DE overlap genes for histology
de_relapse_df = relapse_df[de_relapse_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(de_relapse_df)
proj_3d = umap_3d.fit_transform(de_relapse_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using DE genes (Relapse)',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using DE genes (Relapse)',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(short_histology_df[sh_overlap])
proj_3d = umap_3d.fit_transform(short_histology_df[sh_overlap])
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using Histology Biomarkers',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using Histology Biomarkers',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(relapse_df[r_overlap])
proj_3d = umap_3d.fit_transform(relapse_df[r_overlap])
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP Plot using Relapse Biomarkers',
color=hist, hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='UMAP 3D Plot using Relapse Biomarkers',
color=hist, hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()
# Ammar's list of genes
ammar = ['miR-9-5p', 'miR-124-3p', 'miR-137', 'miR-194-5p', 'miR-340-5p']
# Create a dataframe containing only those genes
ammar_df = main_df.filter(ammar)
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ammar_df)
proj_3d = umap_3d.fit_transform(ammar_df)
umap_2d_df = pd.DataFrame(proj_2d, sample_ids).reset_index()
umap_2d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2']
umap_3d_df = pd.DataFrame(proj_3d, sample_ids).reset_index()
umap_3d_df.columns = ['Sample_ID', 'UMAP1', 'UMAP2', 'UMAP3']
fig_2d = px.scatter(umap_2d_df, x='UMAP1', y='UMAP2',
title='UMAP using MIR9‐5p, MIR124‐3p, MIR137, MIR194‐5p, and MIR340‐5p (Histology)',
color=main_df['Short_Histology'].tolist(), hover_data=['Sample_ID'])
fig_3d = px.scatter_3d(umap_3d_df, x='UMAP1', y='UMAP2', z='UMAP3',
title='3D UMAP using MIR9‐5p, MIR124‐3p, MIR137, MIR194‐5p, and MIR340‐5p (Histology)',
color=main_df['Short_Histology'].tolist(), hover_data=['Sample_ID'])
fig_2d.show()
fig_3d.show()